Extracting hourly temperature data from NOAA ISD (Integrated Surface Database) weather data
The ish_parser Python module is from: https://github.com/haydenth/ish_parser
In [1]:
# boilerplate includes
import sys
import os
import numpy as np
import matplotlib as mpl
#mpl.use('nbagg')
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
#import mpld3 # for outputting interactive html figures
import pandas as pd
import seaborn as sns
import ish_parser
import gzip
import ftplib
import io
from IPython.display import display, HTML
%matplotlib notebook
plt.style.use('seaborn-notebook')
pd.set_option('display.max_columns', None)
In [11]:
# PARAMETERS (might be overridden by a calling script)
# If not called from another script (batch), SUBNOTEBOOK_FLAG might not be defined
try:
    SUBNOTEBOOK_FLAG
except NameError:
    SUBNOTEBOOK_FLAG = False

# Not called as a sub-script? Define parameters here.
if not SUBNOTEBOOK_FLAG:
    # SET PARAMETER VARIABLES HERE UNLESS CALLING USING %run FROM ANOTHER NOTEBOOK
    STATION_CALLSIGN = 'KLAX'
    USE_CACHED_STATION_H5_FILES = True
    SUPPRESS_FIGURE_DISPLAY = False
    DATADIR = '../data/temperatures/ISD'
    OUTDIR = '../data/temperatures'
    FTPHOST = 'ftp.ncdc.noaa.gov'
    FETCH_STATIONS_LIST_FILE = True

print("Fetching and parsing", STATION_CALLSIGN)
We could either pick the station records out by hand, or else try to grab all the data associated with a single station callsign. The latter seems like a cooler way to go... but we have to be careful that the entries really do refer to the same station and that the data is comparable for our purposes.
stations list: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt
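Before committing to that, it can help to peek at which files actually exist on the FTP server; the per-station files are named {USAF}-{WBAN}-{YEAR}.gz, which is the convention the download function below relies on. A minimal sketch (the year and the '722950' id here are just example values):
In [ ]:
# List one year's directory and filter by a USAF id (example values only)
with ftplib.FTP(host=FTPHOST) as ftpconn:
    ftpconn.login()
    filenames = ftpconn.nlst('/pub/data/noaa/2016')
print([f for f in filenames if '722950' in f])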
In [12]:
if FETCH_STATIONS_LIST_FILE:
    # fetch a fresh copy of the stations list
    with open(os.path.join(DATADIR, 'isd-history.txt'), 'wb') as fh:
        with ftplib.FTP(host=FTPHOST) as ftpconn:
            ftpconn.login()
            ftpconn.retrbinary('RETR /pub/data/noaa/isd-history.txt', fh.write)
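As a quick optional sanity check, the first lines of the downloaded file show the fixed-width layout (and the header block the parser below skips with skiprows=22):
In [ ]:
# Peek at the head of the stations list file (optional sanity check)
with open(os.path.join(DATADIR, 'isd-history.txt')) as fh:
    for _ in range(25):
        print(fh.readline().rstrip())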
In [13]:
# function to parse the stations list file
def read_isd_history_stations_list(filename, skiprows=22):
    """Read and parse station information from an isd-history.txt file."""
    fwfdef = (('USAF', (6, str)),
              ('WBAN', (5, str)),
              ('STATION NAME', (28, str)),
              ('CTRY', (4, str)),
              ('ST', (2, str)),
              ('CALL', (5, str)),
              ('LAT', (7, str)),
              ('LON', (8, str)),
              ('ELEV', (7, str)),
              ('BEGIN', (8, str)),
              ('END', (8, str)),
              )
    names = []
    colspecs = []
    converters = {}
    i = 0
    for k, v in fwfdef:
        names.append(k)
        colspecs.append((i, i + v[0] + 1))
        i += v[0] + 1
        converters[k] = v[1]
    stdf = pd.read_fwf(filename, skiprows=skiprows,
                       names=names,
                       colspecs=colspecs,
                       converters=converters)
    return stdf
In [14]:
# actually parse the file
stationsdf = read_isd_history_stations_list(
    os.path.join(DATADIR, 'isd-history.txt'))
In [15]:
# pick just the info associated with the station we want
station_info = stationsdf[stationsdf['CALL'] == STATION_CALLSIGN]
station_info
Out[15]:
In [16]:
# # maybe only use a subset of these entries
# station_info = station_info.iloc[2:3]
# station_info
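One quick way to check whether the entries sharing this callsign really do refer to the same physical station is to look at the spread of their reported coordinates. This is just a sanity-check sketch, not part of the pipeline:
In [ ]:
# Spread of reported coordinates across entries sharing the callsign.
# LAT/LON were read as strings above, so coerce them to numbers first;
# a large spread suggests the entries shouldn't be combined.
coords = station_info[['LAT', 'LON']].apply(pd.to_numeric, errors='coerce')
print('lat spread:', coords['LAT'].max() - coords['LAT'].min())
print('lon spread:', coords['LON'].max() - coords['LON'].min())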
In [17]:
def download_ish_data(usaf_id, wban_id, years_to_get,
                      ftp_host=FTPHOST,
                      verbose=True):
    parser = ish_parser.ish_parser()
    with ftplib.FTP(host=ftp_host) as ftpconn:
        ftpconn.login()
        for year in years_to_get:
            ftp_file = "/pub/data/noaa/{YEAR}/{USAF}-{WBAN}-{YEAR}.gz".format(
                    USAF=usaf_id, WBAN=wban_id, YEAR=year)
            if verbose:
                print(ftp_file)
            # read the whole file and save it to a BytesIO (stream)
            response = io.BytesIO()
            try:
                ftpconn.retrbinary('RETR '+ftp_file, response.write)
            except ftplib.error_perm as err:
                if str(err).startswith('550 '):
                    # no file for this year; note it and move on
                    print('ERROR:', err)
                    continue
                else:
                    raise
            # decompress and parse each line
            response.seek(0)  # jump back to the beginning of the stream
            with gzip.open(response, mode='rb') as gzstream:
                for line in gzstream:
                    parser.loads(line.decode('latin-1'))
    # get the list of all reports
    reports = parser.get_reports()
    if verbose:
        print(len(reports), "records")
    # just return None if no records were found
    if len(reports) <= 0:
        return None
    # convert to a pandas dataframe
    foo = pd.DataFrame.from_records(
            ((r.datetime, r.air_temperature.get_numeric()) for r in reports),
            columns=['datetime', 'AT'],
            index='datetime')
    foo.index = pd.to_datetime(foo.index)  # convert the index to pandas datetime objects
    foo.dropna(inplace=True)  # drop entries which don't have an AT value
    foo.sort_index(inplace=True)  # go ahead and ensure it is sorted
    return foo
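For a one-off smoke test of the function, something like the following works; the USAF/WBAN ids here are just example values, and normally they come from the station_info table above:
In [ ]:
# Smoke test with example ids; substitute real values from station_info
test_df = download_ish_data('722950', '23174', [2016])
if test_df is not None:
    display(test_df.head())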
In [18]:
%%time
df = None
for _, row in station_info.iterrows():
    usaf_id = row['USAF']
    wban_id = row['WBAN']
    years_to_get = range(int(row['BEGIN'][0:4]), int(row['END'][0:4])+1)
    print('####', usaf_id, wban_id, years_to_get)
    station_h5file = os.path.join(DATADIR,
            "{USAF}-{WBAN}-AT.h5".format(USAF=usaf_id, WBAN=wban_id))
    station_df = None
    if USE_CACHED_STATION_H5_FILES:
        if os.path.isfile(station_h5file):
            print("Using cached file: '{}'".format(station_h5file))
            station_df = pd.read_hdf(station_h5file, 'table')
    if station_df is None:
        station_df = download_ish_data(usaf_id, wban_id, years_to_get, ftp_host=FTPHOST)
    if station_df is None:
        print("WARNING: No data found for {} {} {}".format(usaf_id, wban_id, years_to_get))
    else:
        # Save this station's individual data
        print("Saving station data to: '{}'".format(station_h5file))
        station_df.to_hdf(station_h5file, 'table')
        # Combine into a single dataset
        if df is None:
            df = station_df.copy(deep=True)
        else:
            # @TCC TODO: Maybe use some more clever logic than just "combine_first"
            df = df.combine_first(station_df)
# ensure the final combined dataset is sorted
df.sort_index(inplace=True)
# save the combined dataframe
combined_AT_filename = "{}_AT.h5".format(STATION_CALLSIGN)
print("Saving combined data to: '{}'".format(combined_AT_filename))
df.to_hdf(os.path.join(DATADIR, combined_AT_filename), 'table')
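Since many ISD stations report more often than once an hour, a hedged sketch of reducing the combined series to hourly means (not part of the pipeline above; 'H' is pandas' hourly resampling frequency):
In [ ]:
# Optional: collapse sub-hourly reports into hourly means
hourly_at = df['AT'].resample('H').mean()
print(hourly_at.head())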
In [20]:
# # Plot (uncomment to enable)
# if SUPPRESS_FIGURE_DISPLAY:
#     plt.ioff()
# ax = df.plot(title=STATION_CALLSIGN, marker='.')
# ax.set_ylabel('air temperature [$\degree$C]')
# plt.savefig(os.path.join(OUTDIR, '{}_AT_orig.png'.format(STATION_CALLSIGN)))
# plt.ion()
In [ ]:
## Distribution plot (uncomment to enable)
# fig = plt.figure()
# ax = fig.add_subplot(1,1,1)
# sns.kdeplot(df['AT'], bw=.5, ax=ax, legend=False)
# ax.set_xlabel('air temperature [$\degree$C]')
# ax.set_ylabel('proportion of readings')